#Thiophosphorylation analysis for Replicates 1, 2, and 3
#Exp21+24+26 analysis on peptide group level SILAC Replicate 1, 2, and 3
#Eluate and Supernatant samples were analyzed in 1 PD analysis for Rep 1, 2, and 3. (eluate is covalently captured thiophosphopeptides that were eluted, supernatant is unbound sample)
#Missing abundance values were filled using data imputation on PD, randomly assigning the bottom 5% abundance values to these missing ones.
#Imputed values are only assigned to peptides in which it was confidently ID'd and

#load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(tibble)
library(stringr)
library(forcats)
library(plotly)
library(naniar)
library(knitr)
library(ggrepel)
library(matrixStats)
library(ComplexHeatmap)
library(circlize)
library(hypeR)



## themes
## 8-colour palette; the plots below pick colours from it by position
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

## shared ggplot2 theme: black Helvetica text, no grid lines, no panel
## background, solid black panel border
aliceTheme <- local({
  # every text element shares family and colour; extra settings pass through
  helvetica_black <- function(...) {
    element_text(family = "Helvetica", color = "black", ...)
  }
  theme(
    plot.title = helvetica_black(face = "plain"),
    axis.title.x = helvetica_black(face = "plain"),
    axis.title.y = helvetica_black(),
    axis.text = helvetica_black(),
    legend.text = helvetica_black(size = 12),
    legend.text.align = 1,
    axis.ticks = element_line(color = "black"),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(
      linetype = "solid",
      color = "black",
      size = 1,
      fill = NA
    )
  )
})


#### Thiophosphorylation data ####

#set working directory
# NOTE(review): this absolute path ties the script to one machine; every
# relative file name used below is resolved against this directory.
setwd("~/Desktop/2022_05_09_Publication_ThioP_Analysis_R1_R2_R3")

#Import peptide-group level export (PD analysis with imputation, see header)
# read.csv2() is called with sep = "," while its default dec = "," is left in
# place, so values written with "." decimals are not parsed as numbers on
# import - presumably why the code below wraps these columns in as.numeric().
DF.Peptides <- read.csv2(
  "20191007_Exp21_24_26_ThioP_SILAC_Rep1_Rep2_Rep3_Analysis_3_Impute_ttest-(1)_PeptidesAll_2020_04_21.csv",
  sep = ",", header = TRUE, fill = TRUE, stringsAsFactors = FALSE
)
#strip the "-t26_1-p1" suffix so accessions are bare gene IDs
DF.Peptides$Master.Protein.Accessions <- sub("-t26_1-p1", "", DF.Peptides$Master.Protein.Accessions)

#Import Protein level information (NO IMPUTATION in PD analysis). Will be useful down the line to look at how hits compare to their overall protein abundance for H vs L
DF.Proteins <- read.csv2(
  "20191116_Exp21_24_26_ThioP_SILAC_Rep1_Rep2_Rep3_Analysis_5_NoImput_ttest_ProteinsAll.csv",
  sep = ",", header = TRUE, fill = TRUE, stringsAsFactors = FALSE
)
DF.Proteins$Accession <- sub("-t26_1-p1", "", DF.Proteins$Accession)
#columns are converted to numeric further down, after the annotation merge

#generate new columns holding the abundances with imputed values set to NA

# Return `abundance` converted to numeric wherever `origin` is exactly "Det",
# and NA everywhere else (including rows whose origin flag is itself NA).
#   origin    - vector of "Abundances.Origin" flags, one entry per peptide
#   abundance - vector of abundances (character or numeric), same length
# Only the "Det" entries are passed through as.numeric(), exactly as in the
# row-by-row version this replaces; the vectorised form also copes with an
# empty data frame and with NA flags instead of stopping with an error.
keep_detected_abundance <- function(origin, abundance) {
  detected <- !is.na(origin) & origin == "Det"
  measured <- rep(NA_real_, length(origin))
  measured[detected] <- as.numeric(abundance[detected])
  measured
}

#Rep1-3 eluate 1M ie Light
DF.Peptides$Abund.NoImp.elu.L.R1 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F5..Light..Sample..1..light..eluate..1M,
  DF.Peptides$Abundances..by.Bio..Rep....light..eluate..1M.1
)
DF.Peptides$Abund.NoImp.elu.L.R2 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F1..Light..Sample..2..light..eluate..1M,
  DF.Peptides$Abundances..by.Bio..Rep....light..eluate..1M.2
)
DF.Peptides$Abund.NoImp.elu.L.R3 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F7..Light..Sample..3..light..eluate..1M,
  DF.Peptides$Abundances..by.Bio..Rep....light..eluate..1M.3
)

#Rep1-3 eluate dKu80 ie Heavy
DF.Peptides$Abund.NoImp.elu.H.R1 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F5..Heavy..Sample..1..heavy..eluate..dKu80,
  DF.Peptides$Abundances..by.Bio..Rep....heavy..eluate..dKu80.1
)
DF.Peptides$Abund.NoImp.elu.H.R2 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F1..Heavy..Sample..2..heavy..eluate..dKu80,
  DF.Peptides$Abundances..by.Bio..Rep....heavy..eluate..dKu80.2
)
DF.Peptides$Abund.NoImp.elu.H.R3 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F7..Heavy..Sample..3..heavy..eluate..dKu80,
  DF.Peptides$Abundances..by.Bio..Rep....heavy..eluate..dKu80.3
)

#Rep1-3 SUPERNATANT 1M ie Light
DF.Peptides$Abund.NoImp.sup.L.R1 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F6..Light..Sample..1..light..supernatant..1M,
  DF.Peptides$Abundances..by.Bio..Rep....light..supernatant..1M.1
)
DF.Peptides$Abund.NoImp.sup.L.R2 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F4..Light..Sample..2..light..supernatant..1M,
  DF.Peptides$Abundances..by.Bio..Rep....light..supernatant..1M.2
)
DF.Peptides$Abund.NoImp.sup.L.R3 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F8..Light..Sample..3..light..supernatant..1M,
  DF.Peptides$Abundances..by.Bio..Rep....light..supernatant..1M.3
)

#Rep1-3 SUPERNATANT dKu80 ie Heavy
DF.Peptides$Abund.NoImp.sup.H.R1 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F6..Heavy..Sample..1..heavy..supernatant..dKu80,
  DF.Peptides$Abundances..by.Bio..Rep....heavy..supernatant..dKu80.1
)
DF.Peptides$Abund.NoImp.sup.H.R2 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F4..Heavy..Sample..2..heavy..supernatant..dKu80,
  DF.Peptides$Abundances..by.Bio..Rep....heavy..supernatant..dKu80.2
)
DF.Peptides$Abund.NoImp.sup.H.R3 <- keep_detected_abundance(
  DF.Peptides$Abundances.Origin..F8..Heavy..Sample..3..heavy..supernatant..dKu80,
  DF.Peptides$Abundances..by.Bio..Rep....heavy..supernatant..dKu80.3
)
#### DF for supernatant ####
#DF.sup will contain peptides from supernatant in Rep 1 (F6), Rep 2 (F4), and Rep 3 (F8)
#Keep peptides flagged "High" or "Peak Found" in at least 1/3 replicates.
#%in% is used rather than == so that an NA flag simply counts as "not found";
#with == an NA flag would put an all-NA row into the subset.
DF.sup <- DF.Peptides[DF.Peptides$Found.in.File...F6. %in% c("High", "Peak Found") |
                        DF.Peptides$Found.in.File...F4. %in% c("High", "Peak Found") |
                        DF.Peptides$Found.in.File...F8. %in% c("High", "Peak Found"), ]

#### Protein level analysis for PD analysis WITHOUT IMPUTATION ####
#DF for proteins
#DF.Proteins will contain proteins from supernatant in Rep 1 (F6), Rep 2 (F4), and Rep 3 (F8).
#Subset proteins flagged "High" or "Peak Found" in at least 1/3 replicates
DF.Proteins <- DF.Proteins[DF.Proteins$Found.in.File...F6. %in% c("High", "Peak Found") |
                             DF.Proteins$Found.in.File...F4. %in% c("High", "Peak Found") |
                             DF.Proteins$Found.in.File...F8. %in% c("High", "Peak Found"), ]

#Determine normalized abundance ratios for SUPERNATANT by dividing by the median abundance ratio of each replicate, excluding NAs
library(matrixStats)
DF.Proteins$Norm.Abund.Ratio.R1 <- as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.1.....light..supernatant..1M.1.)/median(as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.1.....light..supernatant..1M.1.), na.rm = TRUE)
DF.Proteins$Norm.Abund.Ratio.R2 <- as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.2.....light..supernatant..1M.2.)/median(as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.2.....light..supernatant..1M.2.), na.rm = TRUE)
DF.Proteins$Norm.Abund.Ratio.R3 <- as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.3.....light..supernatant..1M.3.)/median(as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.3.....light..supernatant..1M.3.), na.rm = TRUE)
DF.Proteins$Avg.Norm.Abund.Ratio <- rowMeans(DF.Proteins[, c("Norm.Abund.Ratio.R1", "Norm.Abund.Ratio.R2", "Norm.Abund.Ratio.R3")], na.rm = TRUE)
#Drop every protein that has an NA in ANY column of the table
DF.Proteins <- DF.Proteins[complete.cases(DF.Proteins), ]

#Per-protein median and MAD (matrixStats::rowMads) of the three normalized
#ratios, computed for all rows at once. local() keeps the helper matrix out of
#the workspace; unname() keeps row names from being attached to the columns.
DF.Proteins$med.Norm.Abund.Ratio <- local({
  ratio.matrix <- as.matrix(DF.Proteins[, c("Norm.Abund.Ratio.R1", "Norm.Abund.Ratio.R2", "Norm.Abund.Ratio.R3")])
  unname(rowMedians(ratio.matrix, na.rm = TRUE))
})
#The row loop this replaces stored the MAD under the misspelled name
#"MedAbxsStdDev.Norm.Abund.Ratio", leaving this column - and the ratio derived
#from it below - entirely NA.
#NOTE(review): the misspelled extra column is no longer created, so DF.Proteins
#has one column fewer than before; check the positional range used when the
#table is converted to numeric later in the script.
DF.Proteins$MedAbsStdDev.Norm.Abund.Ratio <- local({
  ratio.matrix <- as.matrix(DF.Proteins[, c("Norm.Abund.Ratio.R1", "Norm.Abund.Ratio.R2", "Norm.Abund.Ratio.R3")])
  unname(rowMads(ratio.matrix, na.rm = TRUE))
})

#Relative spread: MAD divided by the median
DF.Proteins$Var.Norm.Abund.Ratio <- DF.Proteins$MedAbsStdDev.Norm.Abund.Ratio / DF.Proteins$med.Norm.Abund.Ratio

#Plot distribution pre and post normalization of supernatant protein abundances
#One base-graphics plot per replicate: plot() draws the density of the log2 raw
#heavy/light ratio, lines() then overlays the log2 median-normalized ratio in red
#on that same plot.
#Rep1 proteins sup
plot(density(log2(as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.1.....light..supernatant..1M.1.)), na.rm = TRUE))
lines(density(log2(DF.Proteins$Norm.Abund.Ratio.R1), na.rm = TRUE), col= "red")
#Rep2 proteins sup
plot(density(log2(as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.2.....light..supernatant..1M.2.)), na.rm = TRUE))
lines(density(log2(DF.Proteins$Norm.Abund.Ratio.R2), na.rm = TRUE), col= "red")
#Rep3 proteins sup
plot(density(log2(as.numeric(DF.Proteins$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.3.....light..supernatant..1M.3.)), na.rm = TRUE))
lines(density(log2(DF.Proteins$Norm.Abund.Ratio.R3), na.rm = TRUE), col= "red")

#### Normalization factor calculation for peptides ####
#For every replicate the median supernatant heavy/light peptide abundance ratio
#is computed; these medians are the per-replicate normalization factors.

#Blank ratio cells become NA so that they are left out of the medians
#(the medians are taken with na.rm = TRUE below)
is.na(DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.1.....light..supernatant..1M.1.) <-
  DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.1.....light..supernatant..1M.1. == ""
is.na(DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.2.....light..supernatant..1M.2.) <-
  DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.2.....light..supernatant..1M.2. == ""
is.na(DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.3.....light..supernatant..1M.3.) <-
  DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.3.....light..supernatant..1M.3. == ""

#Numeric copies of the PD ratio columns (temporary, removed again below)
sup.ratio.R1 <- as.numeric(DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.1.....light..supernatant..1M.1.)
sup.ratio.R2 <- as.numeric(DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.2.....light..supernatant..1M.2.)
sup.ratio.R3 <- as.numeric(DF.sup$Abundance.Ratios..by.Bio..Rep.....heavy..supernatant..dKu80.3.....light..supernatant..1M.3.)

#Median abundance ratio per replicate
R1.sup.median <- median(sup.ratio.R1, na.rm = TRUE)
R2.sup.median <- median(sup.ratio.R2, na.rm = TRUE)
R3.sup.median <- median(sup.ratio.R3, na.rm = TRUE)

#Supernatant ratios divided by their replicate median, plus the row-wise mean
DF.sup$NormAbundRatio.sup.R1 <- sup.ratio.R1 / R1.sup.median
DF.sup$NormAbundRatio.sup.R2 <- sup.ratio.R2 / R2.sup.median
DF.sup$NormAbundRatio.sup.R3 <- sup.ratio.R3 / R3.sup.median
DF.sup$Avg.NormAbundRatio.sup <- rowMeans(DF.sup[, paste0("NormAbundRatio.sup.R", 1:3)], na.rm = TRUE)
rm(sup.ratio.R1, sup.ratio.R2, sup.ratio.R3)

#Same calculation without imputation: heavy / light from the Abund.NoImp columns
DF.sup$AbundRatio.sup.noimput.R1 <- with(DF.sup, Abund.NoImp.sup.H.R1 / Abund.NoImp.sup.L.R1)
DF.sup$AbundRatio.sup.noimput.R2 <- with(DF.sup, Abund.NoImp.sup.H.R2 / Abund.NoImp.sup.L.R2)
DF.sup$AbundRatio.sup.noimput.R3 <- with(DF.sup, Abund.NoImp.sup.H.R3 / Abund.NoImp.sup.L.R3)

#Replicate medians of the non-imputed ratios; these are the factors applied to the eluate
R1.sup.median.noimput <- median(DF.sup$AbundRatio.sup.noimput.R1, na.rm = TRUE)
R2.sup.median.noimput <- median(DF.sup$AbundRatio.sup.noimput.R2, na.rm = TRUE)
R3.sup.median.noimput <- median(DF.sup$AbundRatio.sup.noimput.R3, na.rm = TRUE)

DF.sup$Norm.AbundRatio.sup.noimput.R1 <- DF.sup$AbundRatio.sup.noimput.R1 / R1.sup.median.noimput
DF.sup$Norm.AbundRatio.sup.noimput.R2 <- DF.sup$AbundRatio.sup.noimput.R2 / R2.sup.median.noimput
DF.sup$Norm.AbundRatio.sup.noimput.R3 <- DF.sup$AbundRatio.sup.noimput.R3 / R3.sup.median.noimput
DF.sup$Avg.Norm.AbundRatio.sup.noimput <- rowMeans(DF.sup[, paste0("Norm.AbundRatio.sup.noimput.R", 1:3)], na.rm = TRUE)

#### DF for eluate ####
#Eluate table: peptides flagged "High" in at least one of the eluate files
#(Replicate 1 = F5, Replicate 2 = F1, Replicate 3 = F7)
DF.elu <- DF.Peptides[with(DF.Peptides,
                           Found.in.File...F5. == "High" |
                             Found.in.File...F1. == "High" |
                             Found.in.File...F7. == "High"), ]

#Heavy/light abundance ratios per replicate, calculated from the raw abundances
DF.elu$AbundRatio.R1 <- with(DF.elu,
                             as.numeric(Abundances..by.Bio..Rep....heavy..eluate..dKu80.1) /
                               as.numeric(Abundances..by.Bio..Rep....light..eluate..1M.1))
DF.elu$AbundRatio.R2 <- with(DF.elu,
                             as.numeric(Abundances..by.Bio..Rep....heavy..eluate..dKu80.2) /
                               as.numeric(Abundances..by.Bio..Rep....light..eluate..1M.2))
DF.elu$AbundRatio.R3 <- with(DF.elu,
                             as.numeric(Abundances..by.Bio..Rep....heavy..eluate..dKu80.3) /
                               as.numeric(Abundances..by.Bio..Rep....light..eluate..1M.3))

#Peptides that were ID'd but carry no quantification give NA ratios; keep only
#the peptides whose ratio is available in all three replicates
DF.elu <- DF.elu[complete.cases(DF.elu[, c("AbundRatio.R1", "AbundRatio.R2", "AbundRatio.R3")]), ]

#Supernatant heavy/light ratios for the peptides kept in DF.elu
DF.elu$AbundRatio.sup.R1 <- as.numeric(DF.elu$Abundances..by.Bio..Rep....heavy..supernatant..dKu80.1)/as.numeric(DF.elu$Abundances..by.Bio..Rep....light..supernatant..1M.1)
DF.elu$AbundRatio.sup.R2 <- as.numeric(DF.elu$Abundances..by.Bio..Rep....heavy..supernatant..dKu80.2)/as.numeric(DF.elu$Abundances..by.Bio..Rep....light..supernatant..1M.2)
DF.elu$AbundRatio.sup.R3 <- as.numeric(DF.elu$Abundances..by.Bio..Rep....heavy..supernatant..dKu80.3)/as.numeric(DF.elu$Abundances..by.Bio..Rep....light..supernatant..1M.3)

#Apply normalization for eluate
#Divide abundance ratio by the supernatant's median abundance ratio WITHOUT IMPUTATION (determined above) for each replicate
DF.elu$Norm.AbundRatio.R1 <- DF.elu$AbundRatio.R1 / R1.sup.median.noimput
DF.elu$Norm.AbundRatio.R2 <- DF.elu$AbundRatio.R2 / R2.sup.median.noimput
DF.elu$Norm.AbundRatio.R3 <- DF.elu$AbundRatio.R3 / R3.sup.median.noimput
#average normalized abundance ratio for eluate (computed once; a second,
#identical assignment further down was redundant and has been dropped)
DF.elu$Avg.Norm.AbundRatio <- rowMeans(DF.elu[, c("Norm.AbundRatio.R1", "Norm.AbundRatio.R2", "Norm.AbundRatio.R3")], na.rm = TRUE)

#Same normalization applied to the supernatant ratios of these peptides
DF.elu$Norm.AbundRatio.sup.R1 <- DF.elu$AbundRatio.sup.R1 / R1.sup.median.noimput
DF.elu$Norm.AbundRatio.sup.R2 <- DF.elu$AbundRatio.sup.R2 / R2.sup.median.noimput
DF.elu$Norm.AbundRatio.sup.R3 <- DF.elu$AbundRatio.sup.R3 / R3.sup.median.noimput
#Average over all three replicates (this used to list replicate 1 three times,
#so the "average" was just the replicate 1 value)
DF.elu$Avg.Norm.AbundRatio.sup <- rowMeans(DF.elu[, c("Norm.AbundRatio.sup.R1", "Norm.AbundRatio.sup.R2", "Norm.AbundRatio.sup.R3")], na.rm = TRUE)

#see distribution of normalized abundance ratios:
#R1 black, R2 red, R3 blue, replicate average green.
#The green curve is added after plot() so it lands on this plot; it used to be
#drawn before plot() and therefore ended up on the preceding protein-level plot.
plot(density(log2(DF.elu$Norm.AbundRatio.R1)))
lines(density(log2(DF.elu$Norm.AbundRatio.R2)), col = "red")
lines(density(log2(DF.elu$Norm.AbundRatio.R3)), col = "blue")
lines(density(log2(DF.elu$Avg.Norm.AbundRatio)), col = "green")

#log2 transform normalized abundance ratios
#Each listed column X gets a companion column "log2.X"; the order of the list is
#the order in which the new columns are appended to DF.elu.
for (ratio.name in c("Norm.AbundRatio.R1",
                     "Norm.AbundRatio.R2",
                     "Norm.AbundRatio.R3",
                     "Avg.Norm.AbundRatio",
                     "Norm.AbundRatio.sup.R1",
                     "Norm.AbundRatio.sup.R2",
                     "Norm.AbundRatio.sup.R3",
                     "Avg.Norm.AbundRatio.sup")) {
  DF.elu[[paste0("log2.", ratio.name)]] <- log2(DF.elu[[ratio.name]])
}
rm(ratio.name)


#### Calculate p-values ####
#For every peptide, a one-sample, one-sided t-test checks whether the three
#normalized heavy/light ratios (R1-R3) are greater than 1 (mu = 1,
#alternative = "greater"). The p-value is stored in the new column "pvalue".
#The ratios are pulled into a numeric matrix once and tested row by row;
#seq_len() makes this a no-op for an empty table and vapply() guarantees one
#numeric p-value per peptide.
DF.elu$pvalue <- local({
  ratio.matrix <- as.matrix(DF.elu[, c("Norm.AbundRatio.R1", "Norm.AbundRatio.R2", "Norm.AbundRatio.R3")])
  vapply(
    seq_len(nrow(ratio.matrix)),
    function(row) {
      t.test(ratio.matrix[row, ], mu = 1, conf.level = 0.95, alternative = "greater")$p.value
    },
    numeric(1)
  )
})
#for determining non-linear significance threshold:
#-log10(p) multiplied by the log2 average normalized ratio
DF.elu$pvaluexl2fc.norm.abund.ratio <- -log10(DF.elu$pvalue) * DF.elu$log2.Avg.Norm.AbundRatio



#Add features to DF.elu (PepID, Prot.ID, Phospho, No.Phospho, Annotation, Phenotype)
#No.Phospho: the digit in front of "xPhospho" in the Modifications string, or
#"0" when the string does not mention Phospho. Always stored as character so
#the column has one consistent type. The pattern captures a single digit only
#(check to see what max # phospho is in data).
DF.elu$No.Phospho <- ifelse(
  grepl("Phospho", DF.elu$Modifications),
  sub(".*([0-9])xPhospho.*", "\\1", DF.elu$Modifications),
  "0"
)
#phospho: "TRUE"/"FALSE" as strings (the plot filters below compare against
#these strings), "FALSE" exactly when No.Phospho is "0"
DF.elu$phospho <- ifelse(DF.elu$No.Phospho == "0", "FALSE", "TRUE")

#Peptide ID and Prot.ID
DF.elu$PepID <- paste0(DF.elu$Annotated.Sequence, DF.elu$Modifications..all.possible.sites.)
DF.elu$Prot.ID <- DF.elu$Master.Protein.Accessions
DF.elu$Unique.ID.3 <- paste0(DF.elu$No.Phospho, DF.elu$Sequence, DF.elu$Master.Protein.Accessions)

#### Merge toxo info ####
#Three lookup tables, each read with every column as character and with its
#gene identifier column renamed to Prot.ID so that they can be joined on it
annotations <- read.csv2("GT1_Annotation_Phenotype.csv",
                         sep = ",", header = TRUE, fill = TRUE,
                         stringsAsFactors = FALSE, colClasses = "character")
annotations <- rename(annotations, Prot.ID = "ID")

lopit <- read.csv2("LOPIT_raw_all.csv",
                   sep = ",", header = TRUE, fill = TRUE,
                   stringsAsFactors = FALSE, colClasses = "character")
lopit <- rename(lopit, Prot.ID = "Gene_ID")

micronemeIP <- read.csv2("2022_04_04_EB_MicronemeIPs_Subset.csv",
                         sep = ",", header = TRUE, fill = TRUE,
                         stringsAsFactors = FALSE, colClasses = "character")
micronemeIP <- rename(micronemeIP, Prot.ID = "Gene_ID")

#toxoinfo: gene ID, phenotype, annotation, LOPIT localisation prediction and
#the two microneme IP columns ("new" is renamed to newMIC)
toxoinfo <- annotations %>%
  left_join(lopit, by = "Prot.ID") %>%
  left_join(micronemeIP, by = "Prot.ID") %>%
  select(Prot.ID, phenotype, Annotation, localisation.prediction, new, microneme) %>%
  rename(newMIC = "new")

#Attach the annotation columns to every eluate peptide
DF.elu <- DF.elu %>% left_join(toxoinfo, by = "Prot.ID")

#Convert to numeric. Columns are addressed by POSITION (17-97 in DF.elu, 6-69 in
#DF.Proteins), so these ranges depend on the column layout built up above.
DF.elu <- mutate_at(DF.elu, 17:97, as.numeric)
DF.Proteins <- rename(mutate_at(DF.Proteins, 6:69, as.numeric), Prot.ID = "Accession")


#### Plots ####

#Scatter plot of the grouped eluate abundances in DF.elu (CDPK1-M on x, CDPK1-G on y).
#Peptide classes, using the cutoff pvaluexl2fc.norm.abund.ratio > 4:
#  significantThioP            above the cutoff, phospho
#  significantThioP.noPhospho  above the cutoff, no phospho
#  notsignficant.phospho       below the cutoff, phospho
significantThioP <- filter(DF.elu, pvaluexl2fc.norm.abund.ratio > 4, phospho == "TRUE")
significantThioP.noPhospho <- filter(DF.elu, pvaluexl2fc.norm.abund.ratio > 4, phospho == "FALSE")
notsignficant.phospho <- filter(DF.elu, phospho == "TRUE" & pvaluexl2fc.norm.abund.ratio < 4)

#Layers are drawn in this order: all peptides (grey), non-significant phospho,
#labels for significant phospho hits, significant non-phospho, significant
#phospho, and finally a label for TGGT1_232830. The text aesthetic is only used
#for the plotly tooltip. Label layers inherit x and y from the plot mapping.
Scatter_Elu_GroupedAbundances <- ggplot(
  DF.elu,
  aes(
    x = log10(Abundances..Grouped...light..eluate..1M),
    y = log10(Abundances..Grouped...heavy..eluate..dKu80),
    text = paste0(Prot.ID, Unique.ID.3, "\n", Annotation, "\nPhenotype: ", phenotype,
                  "\nLOPIT: ", localisation.prediction)
  )
) +
  geom_point(color = cbPalette[1], shape = 16, size = 3, alpha = 0.5) +
  geom_point(data = notsignficant.phospho, color = cbPalette[3], shape = 16, size = 3, alpha = 0.5) +
  geom_text_repel(data = significantThioP, aes(label = Prot.ID)) +
  geom_point(data = significantThioP.noPhospho, color = cbPalette[4], shape = 16, size = 3) +
  geom_point(data = significantThioP, color = cbPalette[2], shape = 16, size = 3) +
  geom_text_repel(
    data = DF.elu %>% filter(Prot.ID == "TGGT1_232830"),
    aes(label = Prot.ID),
    min.segment.length = 0
  ) +
  aliceTheme +
  xlim(2.5, 8) +
  ylim(2.5, 8) +
  theme(aspect.ratio = 1) +
  labs(
    x = "log10(peptide abundance CDPK1-M eluate)",
    y = "log10(peptide abundance CDPK1-G eluate)",
    title = "peptide abundance in eluate - CDPK1-G vs M"
  )
Scatter_Elu_GroupedAbundances
ggplotly(Scatter_Elu_GroupedAbundances, tooltip = c("text"), width = 500, height = 500)


#Volcano plot. Avg.L2FC CDPK1GvsM vs -log10(pvalue) with non-linear significance cutoff
#The three peptide classes are rebuilt from DF.elu with the same conditions as
#for the scatter plot (cutoff pvaluexl2fc.norm.abund.ratio of 4, phospho flag)
significantThioP <- filter(DF.elu, pvaluexl2fc.norm.abund.ratio > 4, phospho == "TRUE")
significantThioP.noPhospho <- filter(DF.elu, pvaluexl2fc.norm.abund.ratio > 4, phospho == "FALSE")
notsignficant.phospho <- filter(DF.elu, phospho == "TRUE" & pvaluexl2fc.norm.abund.ratio < 4)
#function for non-linear statistical cutoff
#x grid spanning the volcano plot's x range (log2 fold change)
x <- seq(from = -10, to = 12, by = 0.1)
#cutoff curve as a two-column table; data.frame() is used because
#as.data.frame(x, y = ...) silently ignored y and produced a table without it
xydata <- data.frame(x = x, y = abs(4 / x))
#Cutoff curve: the -log10(p) value at which -log10(p) * |log2FC| equals
#`threshold` for a given log2 fold change `x`.
#  x         - numeric vector of log2 fold changes
#  threshold - product cutoff; defaults to 4
#Returns abs(threshold / x), i.e. Inf at x == 0.
nonlinearcutoff <- function(x, threshold = 4) abs(threshold / x)

#Volcano of the eluate peptides: log2 average normalized ratio on x, -log10(p) on y.
#Layer order: all peptides (grey), non-significant phospho, significant phospho,
#significant non-phospho, then the dashed non-linear cutoff curve.
#The text aesthetic is only used for the plotly tooltip.
Volcano_Elu <- ggplot(
  DF.elu,
  aes(
    x = log2.Avg.Norm.AbundRatio,
    y = -log10(pvalue),
    text = paste0(Prot.ID, Unique.ID.3, "\n", Annotation, "\nPhenotype: ", phenotype,
                  "\nLOPIT: ", localisation.prediction)
  )
) +
  geom_point(color = cbPalette[1], shape = 16, size = 3, alpha = 0.5) +
  geom_point(data = notsignficant.phospho, color = cbPalette[3], shape = 16, size = 3, alpha = 0.5) +
  geom_point(data = significantThioP, color = cbPalette[2], shape = 16, size = 3) +
  #optional labels for the significant hits (disabled):
  #geom_text_repel(data = significantThioP,
  #aes(x = log10(Abundances..Grouped...light..eluate..1M),
  #y = log10(Abundances..Grouped...heavy..eluate..dKu80),
  #label = Prot.ID)) +
  geom_point(data = significantThioP.noPhospho, color = cbPalette[4], shape = 16, size = 3) +
  #non-linear cutoff curve
  stat_function(fun = nonlinearcutoff, color = "black", size = 0.25, linetype = "longdash") +
  aliceTheme +
  xlim(-10, 12) +
  ylim(0, 4.25) +
  theme(aspect.ratio = 1) +
  labs(
    x = "Average L2FC CDPK1-G vs CDPK1-M",
    y = "-log10(pvalue)",
    title = "Volcano Eluate Peptides"
  )
Volcano_Elu
ggplotly(Volcano_Elu, tooltip = c("text"), width = 500, height = 500)


#Protein-level abundances in the supernatant (whole proteome), with the proteins
#behind the ThioP peptide classes highlighted.
#Purpose: show that overall protein abundances are the same between CDPK1-G and M, especially for the hits.
#There are only 37 hits found in the supernatant whole proteome sample. representative enough.
#Proteins are matched to the peptide classes through Prot.ID.
signficantThioP.InWP <- DF.Proteins %>% filter(Prot.ID %in% significantThioP$Prot.ID)
significantThioP.noPhosphoInWP <- DF.Proteins %>% filter(Prot.ID %in% significantThioP.noPhospho$Prot.ID)
notsignificant.phospho.InWP <- DF.Proteins %>% filter(Prot.ID %in% notsignficant.phospho$Prot.ID)

#Layer order: all proteins (grey), significant phospho, significant non-phospho,
#non-significant phospho
scatter_wp_abundances <- ggplot(
  DF.Proteins,
  aes(
    x = log10(Abundances..Grouped...light..supernatant..1M),
    y = log10(Abundances..Grouped...heavy..supernatant..dKu80)
  )
) +
  geom_point(color = cbPalette[1], shape = 16, size = 3, alpha = 0.5) +
  geom_point(data = signficantThioP.InWP, color = cbPalette[2], shape = 16, size = 3) +
  geom_point(data = significantThioP.noPhosphoInWP, color = cbPalette[4], shape = 16, size = 3) +
  geom_point(data = notsignificant.phospho.InWP, color = cbPalette[3], shape = 16, size = 3)
scatter_wp_abundances


#Export the annotation lookup table and the annotated eluate table as CSV.
#data.table is attached here (mid-script) for fwrite(); files are written to the
#current working directory.
library(data.table)

fwrite(toxoinfo, "2022_05_17_GT1_ToxoInfo_annotaiton_phenotype_lopit_EBmicIP.csv")
fwrite(DF.elu, "2022_05_17_ThioP_DFelu.csv")


#### Heatmap ####

#Heatmap matrix: one row per eluate peptide (row names = Unique.ID.3), one column
#per replicate, values = log2 normalized abundance ratio
set.seed(123)
heatmap.input <- as.matrix(DF.elu[, c("log2.Norm.AbundRatio.R1",
                                      "log2.Norm.AbundRatio.R2",
                                      "log2.Norm.AbundRatio.R3")])
rownames(heatmap.input) <- DF.elu$Unique.ID.3

#Colour ramp: blue at -13, near-white at 0, red at 13
col_fun <- colorRamp2(
  breaks = c(-13, 0, 13),
  colors = c("#2166ac", "#F7F7F7", "#b2182b")
)
#show the colours the ramp assigns to -3 ... 3
col_fun(seq(-3, 3))

#Draw the heatmap: rows clustered on euclidean distance, columns in
#alphabetical order of their names
Heatmap(
  heatmap.input,
  col = col_fun,
  clustering_distance_rows = "euclidean",
  column_order = sort(colnames(heatmap.input))
)


#### GO analysis ####
#Not enough background genes to get statistical power
#GO terms obtained from ToxoDB genome wide
GO.Terms <- read.csv("517-GenesByText_Summary.csv",
                     fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)

#prepare background proteome to test enrichment against
#background proteome is every unique Master.Protein.Accessions value in DF.Peptides
#(1)Filter by unique protein accessions
#(2)Merge GO IDs from GO.Terms
#(3)separate rows by GO IDs (this means that GO terms represent a gene once in the data set)
#(4)Unstack data set(to get list within a list for all GO IDs) to get all GO IDs in independent lists with associated gene IDs in a list

#The accessions are taken straight from DF.Peptides. Earlier this went through
#left_join(DF.Peptides, DF.sup, by = "Master.Protein.Accessions") first; a left
#join keeps exactly the keys of its left table, so after distinct() the result
#was the same, but the many-to-many join multiplied the rows in between.
#NOTE(review): an older comment named DF.elu as the source of the background;
#the code has always used DF.Peptides - confirm which one is intended.
background.proteome <- DF.Peptides %>%
  distinct(Master.Protein.Accessions) %>%
  rename(Gene.ID = Master.Protein.Accessions)

#Try to use genome for background
#background.proteome <- GO.Terms %>% select(Gene.ID) %>% distinct(Gene.ID, .keep_all = TRUE)


#merge GO IDs
background.proteome <- left_join(background.proteome,
                                 GO.Terms %>% select(Gene.ID, Computed.GO.Function.IDs),
                                 by = "Gene.ID")
#the literal string "N/A" marks a missing value
background.proteome[background.proteome == "N/A"] <- NA
#Separate rows by GO IDs (";"-separated in the source), dropping genes without any GO ID
background.proteome <- background.proteome %>%
  separate_rows(Computed.GO.Function.IDs, sep = ";") %>%
  drop_na(Computed.GO.Function.IDs) %>%
  select(Gene.ID, Computed.GO.Function.IDs)
#unstack groups the gene IDs by GO ID: one element per unique GO ID holding the gene IDs annotated with it (redundancies allowed)
background.proteome <- unstack(background.proteome, Gene.ID~Computed.GO.Function.IDs)

#define your genesets (background proteome) to test against
genesets <- background.proteome

#Prepare test set
#Input for the GO analysis: the unique proteins behind the significant thioP
#hits (129 significant hits), restricted to proteins that have at least one GO
#function ID in GO.Terms
signatures <- significantThioP %>%
  distinct(Prot.ID) %>%
  rename(Gene.ID = Prot.ID) %>%
  left_join(GO.Terms %>% select(Gene.ID, Computed.GO.Function.IDs), by = "Gene.ID")
#the literal string "N/A" marks a missing value
signatures[signatures == "N/A"] <- NA
signatures <- drop_na(signatures, Computed.GO.Function.IDs)

#keep only the gene IDs, as a plain vector
signatures <- signatures$Gene.ID

#Run hypergeometric test with hypeR
#signature  = signatures (gene IDs of the significant hits that have a GO term)
#genesets   = genesets (gene IDs of the background proteome grouped by GO ID)
#background = size of the background population; hard-coded to 1002, described
#             as the # of gene IDs in the background proteome that have a GO term
#NOTE(review): 1002 is not recomputed from `genesets`; if the input data change,
#confirm that this number still matches.
hyp_obj <- hypeR(signature = signatures, genesets = genesets, background = 1002, plotting = TRUE)

hyp_to_excel(hyp_obj,file_path="2022_07_17_ThioP_GOterm_HypeR_results.xlsx")  ##export the hypeR result to an Excel file


#### Import merged hyp_obj
#hypeR results are read back from a CSV file (the export above writes .xlsx, so
#the CSV has to exist already); the "label" column holds the GO ID
hyp_obj_merge <- read.csv("2022_07_17_ThioP_GOterm_HypeR_results.csv",
                          fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)
hyp_obj_merge <- rename(hyp_obj_merge, GO_ID = label)

#GO ID -> GO term lookup, one row per GO ID
GOterms.unlisted <- read.csv("GO_output_df.csv",
                             fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)
GOterms.unlisted <- GOterms.unlisted %>%
  rename(GO_ID = Computed.GO.Function.IDs,
         GO_term = Computed.GO.Functions) %>%
  distinct(GO_ID, .keep_all = TRUE)

#Add the GO term names by GO ID; this is an inner join, so results whose GO ID
#is missing from the lookup are dropped
hyp_obj_merge <- hyp_obj_merge %>%
  inner_join(select(GOterms.unlisted, GO_ID, GO_term), by = "GO_ID")

#### Filter merged hyp_obj
#Keep GO terms with pval < 0.05, then add -log10(p) and the fold enrichment:
#(overlap / signature size) relative to (geneset size / background size)
hyp_obj_merge <- hyp_obj_merge %>%
  filter(pval < 0.05) %>%
  mutate(
    logpval = -log10(pval),
    fold_change = (overlap / signature) / (geneset / background)
  )

#### Plot GO plot
#Dot plot of the enriched GO terms: fold enrichment on x, GO IDs on y ordered by
#fold enrichment, point size = geneset size, colour = -log10(p).
#Each term is drawn twice: an outlined point (shape 21) and a solid point
#(shape 16) on top of it. The dotted vertical line marks fold change 1.
GOplot <- ggplot(hyp_obj_merge,
                 aes(x = fold_change, y = reorder(GO_ID, fold_change),
                     fill = logpval, size = geneset, label = GO_term)) +
  geom_point(shape = 21, stroke = 0.5) +
  #scale_size(range = c(0,6)) +
  geom_point(aes(color = logpval), shape = 16) +
  scale_color_gradientn(colours = c("#fef0d9", "#800026"),
                        limits = c(1, 2)) +
  #geom_text() +
  #aliceTheme +
  geom_vline(xintercept = 1, linetype = "dotted", size = 1) +
  #xlim(0,11) +
  scale_x_continuous(limits = c(0, 11), breaks = c(0, 2, 4, 6, 8, 10)) +
  theme(panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(linetype = "solid",
                                    color = "black",
                                    size = 1,
                                    fill = NA))

#Pass the plot explicitly instead of relying on ggsave()'s default of "the last
#plot", so the right figure is saved even if another plot is built in between
ggsave("2022_07_18_ThioP_GOenrichment.pdf", plot = GOplot,
       width = 4.25, height = 3, units = "in")


#### hit classes ####
#Hit-class table for the significant ThioP peptides
ThioP.hitclasses <- read.csv("2022_07_19_significantThioP_HitClasses.csv",
                             fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)

#Time-course phosphopeptide table, reduced to hit class 5 and to the
#identification / modification columns listed below
timecourse.hitclass.5 <- read.csv("Processed_Zap_Phosphopeptides-CDPK1-dep-marked-with-hit-classes.csv",
                                  fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)
timecourse.hitclass.5 <- timecourse.hitclass.5 %>%
  filter(hit.class == 5) %>%
  select(
    Master.Protein.Accessions,
    Annotation, phenotype,
    Unique.ID,
    hit.class,
    Sequence,
    Modifications.in.Master.Proteins,
    Modifications.in.Master.Proteins..all.Sites.,
    Modifications..all.possible.sites.
  )
